{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train = pd.read_csv('../data/train_drop_col_row.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cust_id</th>\n",
       "      <th>cust_group</th>\n",
       "      <th>y</th>\n",
       "      <th>x_1</th>\n",
       "      <th>x_2</th>\n",
       "      <th>x_3</th>\n",
       "      <th>x_4</th>\n",
       "      <th>x_5</th>\n",
       "      <th>x_6</th>\n",
       "      <th>x_7</th>\n",
       "      <th>...</th>\n",
       "      <th>x_148</th>\n",
       "      <th>x_149</th>\n",
       "      <th>x_150</th>\n",
       "      <th>x_151</th>\n",
       "      <th>x_152</th>\n",
       "      <th>x_153</th>\n",
       "      <th>x_154</th>\n",
       "      <th>x_155</th>\n",
       "      <th>x_156</th>\n",
       "      <th>x_157</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>110000</td>\n",
       "      <td>group_3</td>\n",
       "      <td>0</td>\n",
       "      <td>0.354167</td>\n",
       "      <td>0.604988</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>-99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>110001</td>\n",
       "      <td>group_3</td>\n",
       "      <td>0</td>\n",
       "      <td>0.125000</td>\n",
       "      <td>0.012058</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>110002</td>\n",
       "      <td>group_3</td>\n",
       "      <td>0</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.565979</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>110003</td>\n",
       "      <td>group_3</td>\n",
       "      <td>0</td>\n",
       "      <td>0.208333</td>\n",
       "      <td>0.316209</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>110004</td>\n",
       "      <td>group_3</td>\n",
       "      <td>0</td>\n",
       "      <td>0.208333</td>\n",
       "      <td>0.008061</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 121 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   cust_id cust_group  y       x_1       x_2  x_3  x_4  x_5  x_6  x_7  ...    \\\n",
       "0   110000    group_3  0  0.354167  0.604988  -99  -99  -99  -99  -99  ...     \n",
       "1   110001    group_3  0  0.125000  0.012058  -99  -99  -99  -99  -99  ...     \n",
       "2   110002    group_3  0  0.333333  0.565979    0    0    0    0    0  ...     \n",
       "3   110003    group_3  0  0.208333  0.316209    0    0    0    0    1  ...     \n",
       "4   110004    group_3  0  0.208333  0.008061  -99  -99  -99  -99  -99  ...     \n",
       "\n",
       "   x_148  x_149  x_150  x_151  x_152  x_153  x_154  x_155  x_156  x_157  \n",
       "0      1      1      1      1      1      1      1      1      3    -99  \n",
       "1      1      1      1      1      1      1      1      1      2      2  \n",
       "2      1      1      2      1      1      1      1      1      2      2  \n",
       "3      2      1      1      1      1      1      1      1      2      4  \n",
       "4      1      1      1      1      1      1      1      1      2      1  \n",
       "\n",
       "[5 rows x 121 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "test = pd.read_csv('../data/train_x_drop_col_row.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['cust_id', 'cust_group', 'x_1', 'x_2', 'x_3', 'x_4', 'x_5', 'x_6',\n",
       "       'x_7', 'x_8', 'x_9', 'x_10', 'x_11', 'x_12', 'x_13', 'x_14',\n",
       "       'x_15', 'x_16', 'x_17', 'x_18', 'x_19', 'x_20', 'x_21', 'x_22',\n",
       "       'x_23', 'x_24', 'x_25', 'x_26', 'x_27', 'x_28', 'x_29', 'x_30',\n",
       "       'x_31', 'x_32', 'x_33', 'x_34', 'x_35', 'x_36', 'x_37', 'x_38',\n",
       "       'x_39', 'x_40', 'x_41', 'x_42', 'x_43', 'x_44', 'x_45', 'x_46',\n",
       "       'x_47', 'x_48', 'x_49', 'x_50', 'x_51', 'x_52', 'x_53', 'x_54',\n",
       "       'x_55', 'x_56', 'x_57', 'x_58', 'x_59', 'x_60', 'x_61', 'x_62',\n",
       "       'x_63', 'x_64', 'x_65', 'x_66', 'x_67', 'x_68', 'x_69', 'x_70',\n",
       "       'x_71', 'x_72', 'x_73', 'x_74', 'x_75', 'x_76', 'x_77', 'x_78',\n",
       "       'x_79', 'x_80', 'x_81', 'x_82', 'x_83', 'x_84', 'x_85', 'x_86',\n",
       "       'x_87', 'x_88', 'x_89', 'x_90', 'x_91', 'x_93', 'x_95', 'x_96',\n",
       "       'x_97', 'x_98', 'x_99', 'x_100', 'x_101', 'x_139', 'x_140',\n",
       "       'x_141', 'x_142', 'x_143', 'x_144', 'x_145', 'x_146', 'x_147',\n",
       "       'x_148', 'x_149', 'x_150', 'x_151', 'x_152', 'x_153', 'x_154',\n",
       "       'x_155', 'x_156', 'x_157'], dtype=object)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test.columns.values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(13602, 103)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iv_feature = ['x_3', 'x_4', 'x_5', 'x_6', 'x_7', 'x_8', 'x_9', 'x_10', 'x_11', 'x_12', 'x_13', 'x_14', 'x_15', 'x_16', 'x_17', 'x_18', 'x_19', 'x_20', 'x_21', 'x_22', 'x_23', 'x_24', 'x_25', 'x_26', 'x_27', 'x_28', 'x_29', 'x_30', 'x_31', 'x_32', 'x_33', 'x_34', 'x_35', 'x_36', 'x_37', 'x_38', 'x_39', 'x_40', 'x_41', 'x_42', 'x_43', 'x_44', 'x_45', 'x_46', 'x_47', 'x_48', 'x_50', 'x_51', 'x_52', 'x_53', 'x_54', 'x_55', 'x_56', 'x_57', 'x_58', 'x_59', 'x_60', 'x_61', 'x_62', 'x_63', 'x_64', 'x_65', 'x_66', 'x_67', 'x_68', 'x_69', 'x_70', 'x_71', 'x_72', 'x_73', 'x_74', 'x_75', 'x_76', 'x_77', 'x_78', 'x_79', 'x_80', 'x_81', 'x_82', 'x_83', 'x_84', 'x_85', 'x_86', 'x_87', 'x_88', 'x_90', 'x_97', 'x_98', 'x_99', 'x_100', 'x_101', 'x_139', 'x_140', 'x_141', 'x_142', 'x_143', 'x_144', 'x_149', 'x_150', 'x_153', 'x_154', 'x_155', 'x_157']\n",
    "x_train = train[iv_feature]\n",
    "x_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(8122, 103)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "iv_feature = ['x_3', 'x_4', 'x_5', 'x_6', 'x_7', 'x_8', 'x_9', 'x_10', 'x_11', 'x_12', 'x_13', 'x_14', 'x_15', 'x_16', 'x_17', 'x_18', 'x_19', 'x_20', 'x_21', 'x_22', 'x_23', 'x_24', 'x_25', 'x_26', 'x_27', 'x_28', 'x_29', 'x_30', 'x_31', 'x_32', 'x_33', 'x_34', 'x_35', 'x_36', 'x_37', 'x_38', 'x_39', 'x_40', 'x_41', 'x_42', 'x_43', 'x_44', 'x_45', 'x_46', 'x_47', 'x_48', 'x_50', 'x_51', 'x_52', 'x_53', 'x_54', 'x_55', 'x_56', 'x_57', 'x_58', 'x_59', 'x_60', 'x_61', 'x_62', 'x_63', 'x_64', 'x_65', 'x_66', 'x_67', 'x_68', 'x_69', 'x_70', 'x_71', 'x_72', 'x_73', 'x_74', 'x_75', 'x_76', 'x_77', 'x_78', 'x_79', 'x_80', 'x_81', 'x_82', 'x_83', 'x_84', 'x_85', 'x_86', 'x_87', 'x_88', 'x_90', 'x_97', 'x_98', 'x_99', 'x_100', 'x_101', 'x_139', 'x_140', 'x_141', 'x_142', 'x_143', 'x_144', 'x_149', 'x_150', 'x_153', 'x_154', 'x_155', 'x_157']\n",
    "x_test = test[iv_feature]\n",
    "x_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "iv_feature = ['x_3', 'x_4', 'x_5',\n",
    "       'x_6', 'x_7', 'x_8', 'x_9', 'x_10', 'x_11', 'x_12', 'x_13', 'x_14',\n",
    "       'x_15', 'x_16', 'x_17', 'x_18', 'x_19', 'x_20', 'x_21', 'x_22',\n",
    "       'x_23', 'x_24', 'x_25', 'x_26', 'x_27', 'x_28', 'x_29', 'x_30',\n",
    "       'x_31', 'x_32', 'x_33', 'x_34', 'x_35', 'x_36', 'x_37', 'x_38',\n",
    "       'x_39', 'x_40', 'x_41', 'x_42', 'x_43', 'x_44', 'x_45', 'x_46',\n",
    "       'x_47', 'x_48', 'x_50', 'x_51', 'x_52', 'x_53', 'x_54', 'x_55',\n",
    "       'x_56', 'x_57', 'x_58', 'x_59', 'x_60', 'x_61', 'x_62', 'x_63',\n",
    "       'x_64', 'x_65', 'x_66', 'x_67', 'x_68', 'x_69', 'x_70', 'x_71',\n",
    "       'x_72', 'x_73', 'x_74', 'x_75', 'x_76', 'x_77', 'x_78', 'x_79',\n",
    "       'x_80', 'x_81', 'x_82', 'x_83', 'x_84', 'x_85', 'x_86', 'x_87',\n",
    "       'x_88', 'x_90', 'x_97', 'x_98', 'x_99', 'x_100', 'x_101', 'x_139',\n",
    "       'x_140', 'x_141', 'x_142', 'x_143', 'x_144', 'x_149', 'x_150',\n",
    "       'x_153', 'x_154', 'x_155', 'x_157']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x1f6f2478e80>"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAPkAAADuCAYAAAD7nKGzAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvqOYd8AAAGAxJREFUeJzt3XmUFOW9xvHv2z07+7CDSCHIIhpU\nwDWiVzSoZVCWG0WMejV6olFjzI1Wco2XXG+0TDTXGKPRRI27aIxLKI4ScWENIioKsmMpoOzDbDBL\nd9f9o1oFHKCH6Z63qvr3OaePh2G6+xmc57xvVVe9r/I8DyFEdMV0BxBC5JaUXIiIk5ILEXFSciEi\nTkouRMRJyYWIOCm5EBEnJRci4qTkQkSclFyIiJOSCxFxUnIhIk5KLkTEScmFiDgpuRARJyUXIuKk\n5EJEnJRciIiTkgsRcVJyISJOSi5ExEnJhYg4KbkQESclFyLipORCRJyUXIiIk5ILEXFSciEiTkou\nRMRJyYWIuALdAUTrMCynG9ANKAVK9vGIATuBGqAaqAK2AVtd26zUEFtkgZL9yaPBsJw2wBCgH2Ds\n9egLtGnhWzQCm4BVwLL0YzmwzLXNDS18bZFDUvKQMiynP3Bi+nEScBQQ1xSnClgBLAVmAzNd2/xU\nUxaxFyl5CBiWo4DhwGj8Qp+AP/UOsk+AN4E3gDdc2/xCc568JSUPKMNyioDTgfOA7wK99SZqsRXA\nDOBZ1zbn6Q6TT6TkAWJYThy/2BcC44BOehPlzFrgKeAJ1zZX6Q4TdVLyADAspzfwI+AKgj8Nz7Z3\ngCfxR/gtusNEkZRcI8NyTgJ+DIxHPs5MAK8Ad8t0Pruk5K0sfax9AXA9MEJznKCaD9wFvOTaZkp3\nmLCTkrcSw3JK8UftHwM9NMcJi+XAr4FnXNtM6g4TVlLyHDMsJwZcCtxG+M+Q67Ia+F/8E3UysjeT\nlDyHDMs5C/gN/oUqouXeBa51bXOB7iBhIiXPAcNyjsEv9xm6s0SQBzwG3Oza5mbdYcJASp5FhuV0\nBH4HXAYovWkirxKYAtzn2mZCc5ZAk5JniWE5Y4CHkePu1rYUfwr/lu4gQSUlbyHDctoCdwNX6c6S\nxzzgHsBybbNBd5igkZK3gGE5pwKP4t/eKfR7D7hQLpXdk5T8IKQ/874D/4IWOfYOlhrgOtc2/6o7\nSFBIyZvJsJx+wIvAMN1ZxH49DVzt2maV7iC6ScmbwbCcM4CpQLnuLCIja4Hvuba5SHcQnWQhxwwZ\nlvNT4FWk4GFyGDDLsJxzdQfRSUbyAzAspwC4H7hSdxZx0JLAj1zbfFB3EB2k5PthWE4H4HngTN1Z\nRFbc4drmL3SHaG1S8n0wLKcrMBO57jxqngQud22zUXeQ1iIlb0K64G8AR+rOInJiJjA+X868S8n3\nIgXPG4uA0fmwaYScXd+NFDyvDAdeTV+WHGlS8rR0wd9ECp5PTgAcw3LKdAfJJSk5exR8qO4sotWN\nAv5uWE6h7iC5kvclT1+H7iAFz2djgL+md6qJnLwvOfAIMFJ3CKHdRfgLfkROXpfcsJxb8HcrEQLg\nBsNyrtYdItvy9iM0w3LGAS8gt4qKPTUAp7i2+Y7uINmSlyU3LGcYMJeW79ktomkdcKxrm1t1B8mG\nvJuuG5bTDX87Him42Jc+wNPpNfNDLxI/RKbS/9OeAw7VnUUE3pnAr3SHyIa8KjlwA3Cq7hAiNP7L\nsBxTd4iWyptjcsNyhuAv9FeiO4sIlQrgaNc2P9Md5GDlxUieXvjhMaTgovk6AQ/oDtESeVFy4OfI\nBS/i4J1jWM4FukMcrMhP19P7ki0AInttsmgVm4Ahrm1W6A7SXJEeyQ3LKQIeRwouWq478FvdIQ5G\npEsO/Ay5dVRkz+XpXXNCJbLTdcNyeg
ErkYteRHatBL7l2ma97iCZivJIfgdScJF9AwFLd4jmiORI\nbljOcGAhcvOJyI1qwHBtc7vuIJmI6kh+J1JwkTvt8M/3hELkRnLDckYDr+vOISKvFujn2uYW3UEO\nJIoj+R26A4i80Aa4WXeITERqJE/fTDBNdw6RN3YBh7m2uVF3kP2J2kh+g+4AIq+U4l8yHWiRGckN\nyxkKLNGdQ+SdeqC/a5sbdAfZlyiN5NfrDiDyUjEQ6MUfIzGSG5ZTDqzHnz4J0do2AX2CulNqVEby\nK5GCC326AxN0h9iX0Jc8vSDEj3TnEHkvsFP20JccOB9/dU0hdBplWM4A3SGaEoWSX6w7gBBpl+kO\n0JRQn3hL7y29BVm7TQTDeqCva5sp3UF2F/aR3EQKLoLjEOA03SH2FvaSB/aMpshb5+oOsLfQljy9\nr/g5unMIsZfAbcYQ2pIDZyErv4jgGWhYTn/dIXYX5pJP1B1AiH0I1AwzlCVPXwATuGMfIdICNWUP\nZcmBYUB73SGE2IdTDcsp0x3iS2Et+Ym6AwixHyXAaN0hvhTWkp+kO4AQByAlbyEZyUXQHas7wJdC\nd1mrYTk9gc915xDiAKqAjq5tai9YGEdyGcVFGLQHDtMdAjSWXCl1llJqhVJqtVKqOdvOyPG4CItj\ndAcATSVXSsWBPwJnA0cAk5RSR2T49ONzFkyI7MrfkgPHAas9z1vreV4D8CxwXobPHZy7WEJkVV6X\nvDewbrc/r09/bb8My+kAdMlVKCGy7GjdAUBfyZvajDCTs5CHZzuIEDnU07CcTrpD6Cr5evZcl+0Q\nMvtYLJBraAmxH710ByjQ9L4LgcOVUv2ADcCFwEUZPO/QbLx51bsvU7P4NfCg7bAxtB95HjvmPEXN\n4teIlXUAoNOoSyjtPzKj5wI0bFrLttf+iJdsQMXilJ95NcW9BlG7Yi6Vs58iVtqWruNvIV7ansaK\nL9gx63G6nheK/fJEy/QAluoMoKXknucllFLXAq8BceARz/My+Ydo8aqsDVtcaha/Ro9LfoeKF7L5\nuVsp7T8CgHYjzqfD8eOb/dzC8t5UvPUoHU+eRGn/Eexas5CKtx6lx0U21e+8SI/v30XtslnUfvw2\n7Yd/lx2zn6DjKbL+ZJ7oqTuAts/JPc+b7nneQM/z+nue9+sMn3ZIS9+3cdt6insNJlZYgorFKe5z\nJDtXzc/Kc1MNO/3/1u8k3raz/0UVw0s24iXqUbE4deuWEG/TicLyA55nFNHQQ3eAsF3x1uJmFHXp\nS926JSR3VZFqrGPX2ndJVm0FoPq9aXz+yLVsnX4PybqaZj23fPRVVLz5KOvvv4yKNx+m06mXAtDh\n5Elsfu5W6twPaHPEqVTOm0qHkye19McQ4aG95KG6dt2wnOXAoJa+TvXiGdS876AKSyjs0gdVUEyH\nEyYSK20PSrFj9pMka7bT5Zxv7oTc1HPLR1/J9tcfpLjPkbQZdDK1y2ZTs/hVul+45wSl5qOZpOpr\nKO45iKp3/k6spC2dzriKWKEsOBthT7m2qfXYLGwjeVba0G7Yd+h52e/pMflOYiXtKOzUi3ibTqhY\nHKVitBs2hoYvVmb8XPALXDbQv+K2bPC3qd/r+anGOmqWzKTdMSYVsx6j8zk3UNRjALVL38rGjySC\nK3+PyQ9SVkqerN0BQKJqMztXzqfsiFNJ1Gz/6u93rpxPYZe+GT8XIN62nPp1HwFQ9+nir8r/paoF\nL9B+xFhUvACvscH/oorhJeqz8SOJ4OqqO4Cuj9AOVlZKvuWl20ntqoZYnPIzf0i8pC1bp91Nw6a1\noBQFHbpRPuZaABLV29j26r10//df7fO5AJ3Pvo6K1x/CSyVRBUWUn3XdV++XqN5Gw8bVdPz2ZADa\nHzeOjU/8J7GSNnQdf0s2fiQRXEW6A4TtmLyeAPyjCdEMa1zb1HoR1wGn60qpa5VS2i/NMyxHIQUX\n4a
N9tpxJgB7AQqXUe8AjwGuenuFfTkHn0EC1zj079s76dmpnoDbrC7tGCnboXqE5o+m6UkoB3wH+\nAxgBPAc87HnemtzG+5phOR2BitZ6v3xUSKLhlNiHyyfEZ1ecFFvSqSO1g5WS2VMLuUyp7KczQEZT\nCc/zPKXURmAjkAA6AX9TSv3T87ybchlwN42t9D55q5GCojdSx37rjZS/BmEp9TtHx977aEJ8Vs3I\n2IqubagbrFToPpHRTfvv7QFHcqXU9cClwFbgL8BLnuc1KqViwCrP81pt3yfDchqAwtZ6P7Gn9tRU\nnh1/Z8W4+Ny6YWp1zxIaByjV5G3DWZNMeYz4cy2928WYdtGe+xXUJzwueWkXiz5P0rlMMXViGUbH\nGP9ck8CaWUdDEori8NszSzi9XwH1CY/znt3J+iqPa0YWcc1If5Jy1T92cfWIIo7pGc/Fj/AxUyqH\n5uKFM5XJSN4FGO953qe7f9HzvJRSqrW3KqpEFo3Qpoq2HaYmTz9uavJ0ALqwY8vY+PzVY+PzEkco\nt2+RSmblLsHd/X5BA0O6xKhq4nKCh99vpFOJYvX17Xh2SSM3v17H1IlldClT/GNSGb3axViyOcmY\nJ3ey4cZ2vLYmwfCecaZPLubYB2u5ZmQRizcmSXnkquAADbl64UwdsOSe5926n79blt04B7QDKXlg\nbKVj10eSZ3d9JHk2AH3U5g3nx+a4ZnwBh6sNh8VVqkVXe62vSuGsSvBfpxTzu/nf7MrLKxqZcmox\nABOPKODa6XV4nrdHYYd2jVGX8Ef9whjsSkBit1OLv3yznj+dm9Nzutqn69pP7zfTDt0BxL6t87r1\n/kNyfO8/JP3bdQeqde64+Jx1Z8UWFvRVmwbGlNe5Oa93w6t1/OaMEqobmj6k3FDl0aeDf4qgIKbo\nUALbdnl0Kfv6COKFZQmO6RGjuEBxZv8CnviwkeP/UstNJxfzyopGhveM06tdTk8zVObyxTMhJRc5\ns9LrY9yZmGTcySTA84apNSsnxmd9MTr+fllPtg1Sat+bVk5b2Ui3NorhveK85Saa/J6mqr/7CYKl\nm5Pc/HodMy72t7EviCmenuAf1zcmPcY8uZNXJpVx42t1fFaZ4pJhhYwdlPVTPhuy/YLNJSUXrUSp\nxd6AgYsTAwb+MgExUsnjY8uWTozP2joq9mG7LlQOUYrSL7977mdJXlmRYPqqauoSUFXvcfHfd/Hk\n+K++hUPaK9ZVpjikfYxEyqOyDspL/Zqvr0oxbuouHj+/lP7l3xyp71/YwKXDCpm/LklRHKZOLOXE\nh2u1lVwp9Qj+dtybPc87MpsBpORCixSx+PzU0KHzU/6J50ISDaNiixdPiM+uODm2tPPtoxl8xxkl\nhQBvuQnumtewR8EBxg4s5LHFjZzYp4C/fZzg9H5xlFLsqPMwn97JHaOLOfnQb/6KV+zymLYqwYyL\ny3hlRYKYAqWgrukJQ0tlOpL/FbgPeDzbAcJWcu1TH5EbjRQUzUwNHzYzNRyAMupqz4gt+nB8fHbN\n9saPDI+GvgC3vlnHiF5xxg4q5IpjC/n+iwkG3FtNeani2Yn+VPy+dxpYvT3FbbPquW2Wf1p+xvfL\n6NbGH9H/5+16bjmlGKUUYwYU8MeFDRz1QC0/HJ6T634y+p31PG+WUsrIRYCw3aByIfCM7hyi9bWn\nptKML1hxfnxu3TC1pleJagzLyr0jmFK5KJNvTJd8Wr5P15frDiD0qKJth2eSo497Julv+92Nii1j\n4/NWjY3PSw5Rn/UtzMFn9FmyWneAsI3kpUAtTW/OIPLYoWrT+vRn9GqA2tA/rjzta6sBm5lS2T3T\nb87VSB6qkgMYlvMJYOjOIYJtkPrskwnx2evGxBYW9VFbBsaUV64hxhymVJ6S6TfLdP1ry5GSiwNY\n4R3a7/bE5H63MxnwvGPU6pUT0p/R92D7YKVo1woxml4osAlKqWeA
04AuSqn1wH97nvdwNkKEteRn\n6Q4hwkSp973DB76fOHzgLQmIk0ycEFu2dEJ81pZRsQ87dKZqiFI5Wa8g45J7npezdbrDWnIhDlqS\neMHc1JFD56b8WXERjfWnxRZ/MD4+q/LE2Mfl7dk5WKms3O34XhZeo8XCWPL3dQcQ0dJAYfGM1Iij\nZ6T87bLasKvmzNiixePic2pHxFZ0L6N+4EHcR58EMtuaJ8fCeOKtAP/Ktza6s4j80JHqCjO+YOX5\n8bn131JrexerxkzWUFjElMoROQ+XgdCVHMCwnNeB0bpziPzUne2bz4vPW/3d+PzUYP8z+qY24ryH\nKZU/afVwTQjjdB1gNlJyockmyrs9lDy320NJf82Uvmrj+nHxOZ+YsQXxw9Tn/ePK6w7M0Zvya2Ed\nyUcBb+vOIURThqhPV/+gYPqJE26btlV3FgjfNklfmg98c9tRIQJgmde3OigFh5CW3LXNRuAt3TmE\n2IcZugPsLpQlTwvUP6QQuwnU72aYS/4KTa8AJIROOwjQSTcIccld2/wUmKs7hxB7ed61Te3LMO8u\ntCVPe1J3ACH28oTuAHsLe8mfIwCL1wuR5hKwqTqEvOSubVYA03XnECLtKdc2A3eeKNQlT5MpuwiK\nQP4uRqHk05ClmoV+77q2GcjboENfctc264HndecQeS9wJ9y+FPqSp92LfGYu9KkmB5siZEskSu7a\n5hLkBJzQ50+ubQb2kDESJU+7Q3cAkZfqgf/THWJ/IlNy1zbnEsDPKEXkPeba5he6Q+xPZEqeZusO\nIPJKEviN7hAHEqmSu7bpAB/pziHyxvOuba7RHeJAIlXytDt1BxB5IxQzxyiW/FngY90hROS97Nrm\nYt0hMhG5kru2mQSu151DRFodcKPuEJmKXMkBXNucCbygO4eIrLtc21yrO0SmIlnytBuBnbpDiMj5\nFLhdd4jmiGzJXdv8jJCcGBGh8lPXNnfpDtEckS152m+BT3SHEJHxT9c2Q3cYGOmSu7ZZBwRiqxoR\neo2E9IRupEsO4Nrmy8DLunOI0LsrqPeLH0jkS552JbBJdwgRWu8DU3SHOFh5UXLXNrcAV+jOIUJp\nFzA5aMssN0delBy+uq79Ad05ROjc5NrmMt0hWiJvSp52IxCKSxFFILzo2uZ9ukO0VF6VPH22/Xv4\ny/UIsT+fAJfrDpENeVVyANc2VwJX6c4hAq0B+F6Ql3RqjrwrOYBrm88iy0WJfbvKtc13dYfIlrws\nOYBrm78goIvhC61udW3zMd0hsilvS552OfCG7hAiMB5ybfM23SGyLa9L7tpmIzAeWKI7i9DOAa7R\nHSIXlOfJngSG5RwC/AvorTuL0GIh8G+ubdbqDpILUvI0w3KOwl/Sub3uLKJVrQFOcm1zs+4guZLX\n0/Xdubb5EXA2UKk7i2g1LvCdKBccpOR7cG1zHnAasEVzFJF7y4Bvh2kZp4MlJd+La5sfAKcA63Vn\nETmzCBjl2uYG3UFag5S8Ca5trgC+DazWnUVk3SzgdNc2t+oO0lqk5Pvg2uan+CO67MgSHQ5wlmub\nVbqDtCYp+X64trkROBWYpzuLaLFngHFhW4QxG+QjtAwYllOEvz1tJC+WiLgk8AvXNgO/MWGuSMmb\nwbCci4EHgTLdWURGNgEXuLb5tu4gOknJmyl90cwLwOG6s4j9moN/u2ig9w5vDXJM3kzpi2ZGAi/p\nziL26R78y1TzvuAgI/lBMyxHATcBtwGFmuMIXzVwhWubz+sOEiRS8hYyLGcY8ChwjO4see5V/MUe\n1ukOEjRS8iwwLKcAuBn4JVCsOU6+qQB+ErWFHrJJSp5FhuUMAu4HTtedJU88hb8BoWycsR9S8hww\nLGcycDfQXXeWiFoOXOPa5pu6g4SBlDxHDMvpgL/O+w3IPerZshF/O+oHwryjSWuTkueYYTnlwM+A\n64A2muOE1WbgTvxy591lqS0l
JW8lhuV0xT85dw1QqjlOWGzF32P+Ptc2d+oOE1ZS8lZmWE4P4Of4\nO61K2Zu2Hf+cxr2ubdboDhN2UnJNDMvpCFyMv5vLUZrjBMVs4M/A32Ranj1S8gAwLOcE/LJfQP7d\n/LIVeAz4i2uby3WHiSIpeYAYltMemAz8ADhWc5xcSuFvavFn4CU5U55bUvKAMiznUOCc9GM04R/h\ntwMzgOnAq65tymKZrURKHgKG5RTjryJrph+HaQ2UuQ/wSz0d+Jdrm0nNefKSlDyEDMs5HDgB/5bX\n44Cj0X/NfDX+KqgL04+5rm1+rjeSACl5JKRvkBmIf5b+KGAocAjQA//S2mzdCluPv9rKRvyNCT7C\n30duCbDWtc1Ult5HZJGUPOLS9713Bnril74nfvFL9vpWtdefq/HL/NXDtc2K3KYVuSAlFyLiZPkn\nISJOSi5ExEnJhYg4KbkQESclFyLipORCRJyUXIiIk5ILEXFSciEiTkouRMRJyYWIOCm5EBEnJRci\n4qTkQkSclFyIiJOSCxFxUnIhIk5KLkTEScmFiDgpuRARJyUXIuKk5EJEnJRciIiTkgsRcVJyISJO\nSi5ExEnJhYg4KbkQESclFyLipORCRJyUXIiI+3/0gHbImd3ktwAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "train['y'].value_counts().plot.pie(autopct = '%1.2f%%')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "Y_train = train['y']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(13602,)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Y_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(21724, 103)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x = pd.concat([x_train,x_test])\n",
    "x.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "for i in range(96,158):\n",
    "    col = 'x'+'_'+str(i)\n",
    "    if col in x.columns.values:\n",
    "        dummies_df = pd.get_dummies(x[col]).rename(columns=lambda x: col + str(x))\n",
    "        x = pd.concat([x, dummies_df], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(21724, 158)"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(13602, 158)\n",
      "(8122, 158)\n"
     ]
    }
   ],
   "source": [
    "train_X = x[0:13602]\n",
    "test_X = x[13602:21724]\n",
    "print(train_X.shape)\n",
    "print(test_X.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n",
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.ensemble import AdaBoostClassifier\n",
    "from sklearn.ensemble import ExtraTreesClassifier\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.neighbors import NearestNeighbors\n",
    "from sklearn.svm import SVC\n",
    "from sklearn import metrics  #accuracy_score,recall_score,f1_score\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.metrics import precision_recall_fscore_support\n",
    "from sklearn.utils.multiclass import unique_labels\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn import metrics\n",
    "from xgboost import XGBClassifier\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.semi_supervised import label_propagation\n",
    "from sklearn import model_selection\n",
    "from sklearn.cross_validation import cross_val_score\n",
    "from lightgbm import LGBMClassifier\n",
    "from sklearn.model_selection import train_test_split, GridSearchCV, KFold\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn import linear_model\n",
    "\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense, BatchNormalization, Dropout, Reshape, Flatten, MaxPool2D\n",
    "from keras.layers.convolutional import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D\n",
    "from sklearn.cross_validation import train_test_split\n",
    "from keras.optimizers import RMSprop, Adam\n",
    "from keras.callbacks import ReduceLROnPlateau\n",
    "from keras.callbacks import ModelCheckpoint\n",
    "from keras.utils.np_utils import to_categorical"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "X_train,X_val,y_train,y_val= train_test_split(train_X,Y_train,test_size=0.2,random_state=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def split_data(data_):\n",
    "    idx_1 = data_[data_['label']==0].index\n",
    "    idx_2 = data_[data_['label']==1].index\n",
    "    nb_1 = len(data_.loc[idx_1])\n",
    "    nb_2 = len(data_.loc[idx_2])\n",
    "#     print(nb_1)\n",
    "#     print(nb_2)\n",
    "    idx_list_1 = list(idx_1)\n",
    "    idx_list_2 = list(idx_2)\n",
    "    train_x1 = data_.loc[idx_list_1]\n",
    "    train_x2 = data_.loc[idx_list_2]\n",
    "#     print(train_x1.shape)\n",
    "#     print(train_x2.shape)\n",
    "    return train_x1,train_x2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def resample_data(data, number):\n",
    "    idx_1 = data.index\n",
    "    nb_1 = len(idx_1)\n",
    "#     print(nb_1)\n",
    "#     number = int(nb_1 * rate)\n",
    "    idx_1_sub = np.random.choice(idx_1, number)\n",
    "#     print(idx_1_sub)\n",
    "    nb_2 = len(data.loc[idx_1_sub])\n",
    "#     print(nb_2)\n",
    "    idx_list_1 = list(idx_1_sub)\n",
    "    train_1 = data.loc[idx_1_sub]\n",
    "#     print(train_1.shape)\n",
    "    return train_1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def concat_data(train_x1, train_x2):\n",
    "    train_data1 = train_x1.drop(['label'],axis =1)\n",
    "    train_y1 = train_x1['label']\n",
    "    \n",
    "    train_data2 = train_x2.drop(['label'],axis =1)\n",
    "    train_y2 = train_x2['label']\n",
    "    \n",
    "    train_data = train_data1.append(train_data2)\n",
    "    train_y = train_y1.append(train_y2)\n",
    "    \n",
    "    return train_data, train_y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Program Files\\Anaconda3\\lib\\site-packages\\ipykernel\\__main__.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  from ipykernel import kernelapp as app\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>x_3</th>\n",
       "      <th>x_4</th>\n",
       "      <th>x_5</th>\n",
       "      <th>x_6</th>\n",
       "      <th>x_7</th>\n",
       "      <th>x_8</th>\n",
       "      <th>x_9</th>\n",
       "      <th>x_10</th>\n",
       "      <th>x_11</th>\n",
       "      <th>x_12</th>\n",
       "      <th>...</th>\n",
       "      <th>x_1552</th>\n",
       "      <th>x_1553</th>\n",
       "      <th>x_157-99</th>\n",
       "      <th>x_1571</th>\n",
       "      <th>x_1572</th>\n",
       "      <th>x_1573</th>\n",
       "      <th>x_1574</th>\n",
       "      <th>x_15710</th>\n",
       "      <th>x_15711</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>7021</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6940</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9168</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9796</th>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10905</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 159 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       x_3  x_4  x_5  x_6  x_7  x_8  x_9  x_10  x_11  x_12  ...    x_1552  \\\n",
       "7021     0    0    0    0    0    0    0     0     0     0  ...         0   \n",
       "6940     0    0    0    0    0    0    0     0     0     0  ...         0   \n",
       "9168     0    0    0    0    0    0    0     0     0     0  ...         1   \n",
       "9796   -99  -99  -99  -99  -99  -99  -99   -99   -99   -99  ...         0   \n",
       "10905    0    0    0    0    1    1    0     0     0     0  ...         0   \n",
       "\n",
       "       x_1553  x_157-99  x_1571  x_1572  x_1573  x_1574  x_15710  x_15711  \\\n",
       "7021        0         1       0       0       0       0        0        0   \n",
       "6940        0         1       0       0       0       0        0        0   \n",
       "9168        0         0       0       1       0       0        0        0   \n",
       "9796        0         0       0       1       0       0        0        0   \n",
       "10905       0         1       0       0       0       0        0        0   \n",
       "\n",
       "       label  \n",
       "7021       0  \n",
       "6940       0  \n",
       "9168       0  \n",
       "9796       0  \n",
       "10905      0  \n",
       "\n",
       "[5 rows x 159 columns]"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build the labelled training frame on a copy: a plain `xx = X_train` only aliases\n",
    "# the frame, so adding the 'label' column would also add it to X_train itself.\n",
    "xx = X_train.copy()\n",
    "xx['label'] = y_train\n",
    "\n",
    "xx.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(10439, 159)\n",
      "(442, 159)\n"
     ]
    }
   ],
   "source": [
    "# Split the labelled frame into two subsets and print the shape of each on its own line.\n",
    "train_x1, train_x2 = split_data(xx)\n",
    "print(train_x1.shape, train_x2.shape, sep='\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Call resample_data (defined outside this section) on train_x2 with 10000 as second argument.\n",
    "# NOTE(review): presumably 10000 is the number of rows to draw, chosen to be close to the\n",
    "# 10439 rows of train_x1 shown in the previous cell's output -- confirm against resample_data.\n",
    "train_temp = resample_data(train_x2, 10000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Combine the resampled subset with train_x1; concat_data (defined outside this section)\n",
    "# returns a pair that is used below as the features and targets passed to .fit().\n",
    "multi_x, multi_y= concat_data(train_temp, train_x1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Gradient-boosting classifier fitted on multi_x / multi_y.\n",
    "gb = GradientBoostingClassifier(\n",
    "    n_estimators=100,\n",
    "    learning_rate=0.05,\n",
    "    max_depth=3,\n",
    "    min_samples_split=5,\n",
    "    min_samples_leaf=4,\n",
    ").fit(multi_x, multi_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "    class 0       0.99      0.75      0.86      2616\n",
      "    class 1       0.11      0.73      0.19       105\n",
      "\n",
      "avg / total       0.95      0.75      0.83      2721\n",
      "\n",
      "0.744151376146789\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the gradient-boosting model on the validation split.\n",
    "predictions1 = gb.predict(X_val)\n",
    "\n",
    "target_names = ['class 0', 'class 1']\n",
    "print(classification_report(y_val, predictions1, target_names=target_names))\n",
    "# ROC AUC is a ranking metric: score it with the positive-class probability rather than\n",
    "# the hard 0/1 labels, which reduce the ROC curve to a single operating point.\n",
    "val_auc = metrics.roc_auc_score(y_val, gb.predict_proba(X_val)[:, 1])  # AUC on the validation set\n",
    "print(val_auc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "718"
      ]
     },
     "execution_count": 107,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sum of the gradient-boosting validation predictions (the count of rows predicted 1 for 0/1 labels).\n",
    "predictions1.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "105"
      ]
     },
     "execution_count": 108,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sum of the validation targets; compare with the 'class 1' support in the reports above.\n",
    "np.sum(y_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# XGBoost classifier fitted on multi_x / multi_y.\n",
    "gbm = XGBClassifier(\n",
    "    objective='binary:logistic',\n",
    "    n_estimators=100,\n",
    "    max_depth=2,\n",
    "    min_child_weight=4,\n",
    "    gamma=0.9,\n",
    "    subsample=0.8,\n",
    "    colsample_bytree=0.8,\n",
    "    scale_pos_weight=1,\n",
    "    nthread=-1,\n",
    ").fit(multi_x, multi_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "    class 0       0.99      0.74      0.85      2616\n",
      "    class 1       0.10      0.72      0.18       105\n",
      "\n",
      "avg / total       0.95      0.74      0.82      2721\n",
      "\n",
      "0.7323176059414592\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\preprocessing\\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
      "  if diff:\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the XGBoost model on the validation split.\n",
    "predictions = gbm.predict(X_val)\n",
    "\n",
    "target_names = ['class 0', 'class 1']\n",
    "print(classification_report(y_val, predictions, target_names=target_names))\n",
    "# ROC AUC is a ranking metric: score it with the positive-class probability rather than\n",
    "# the hard 0/1 labels, which reduce the ROC curve to a single operating point.\n",
    "val_auc = metrics.roc_auc_score(y_val, gbm.predict_proba(X_val)[:, 1])  # AUC on the validation set\n",
    "print(val_auc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "754"
      ]
     },
     "execution_count": 119,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sum of the XGBoost validation predictions (the count of rows predicted 1 for 0/1 labels).\n",
    "predictions.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# RBF-kernel support vector classifier fitted on multi_x / multi_y.\n",
    "clf = SVC(\n",
    "    kernel='rbf',\n",
    "    C=0.01,\n",
    "    shrinking=True,\n",
    "    probability=True,\n",
    ").fit(multi_x, multi_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 132,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "             precision    recall  f1-score   support\n",
      "\n",
      "    class 0       0.98      0.85      0.91      2616\n",
      "    class 1       0.11      0.47      0.18       105\n",
      "\n",
      "avg / total       0.94      0.84      0.88      2721\n",
      "\n",
      "0.6587920489296637\n"
     ]
    }
   ],
   "source": [
    "# Evaluate the SVC model on the validation split.\n",
    "predictions2 = clf.predict(X_val)\n",
    "\n",
    "target_names = ['class 0', 'class 1']\n",
    "print(classification_report(y_val, predictions2, target_names=target_names))\n",
    "# ROC AUC is a ranking metric: score it with the continuous decision values rather than\n",
    "# the hard 0/1 labels, which reduce the ROC curve to a single operating point.\n",
    "val_auc = metrics.roc_auc_score(y_val, clf.decision_function(X_val))  # AUC on the validation set\n",
    "print(val_auc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "439"
      ]
     },
     "execution_count": 133,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sum of the SVC validation predictions (the count of rows predicted 1 for 0/1 labels).\n",
    "predictions2.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[3 0 3 ... 0 1 0]\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "    class 0       0.97      0.89      0.93      2616\n",
      "    class 1       0.12      0.39      0.18       105\n",
      "\n",
      "avg / total       0.94      0.87      0.90      2721\n",
      "\n",
      "0.8665931642778391\n",
      "0.6380897771952818\n",
      "0.7323176059414592\n",
      "340\n"
     ]
    }
   ],
   "source": [
    "# Hard-vote ensemble of the three validation predictions.\n",
    "vote_counts = predictions + predictions1 + predictions2\n",
    "print(vote_counts)\n",
    "# Label a row 1 only when all three models voted 1. The threshold is applied to the\n",
    "# whole array at once, so it no longer depends on a loop hard-coded to 2721 rows.\n",
    "pred_sum_sum = (vote_counts >= 3).astype(vote_counts.dtype)\n",
    "\n",
    "target_names = ['class 0', 'class 1']\n",
    "print(classification_report(y_val, pred_sum_sum, target_names=target_names))\n",
    "\n",
    "val_acc = metrics.accuracy_score(y_val, pred_sum_sum)  # accuracy on the validation set\n",
    "print(val_acc)\n",
    "val_auc = metrics.roc_auc_score(y_val, pred_sum_sum)  # AUC of the hard ensemble labels\n",
    "print(val_auc)\n",
    "val_auc1 = metrics.roc_auc_score(y_val, predictions)  # AUC of the hard labels in `predictions`, for comparison\n",
    "print(val_auc1)\n",
    "print(np.sum(pred_sum_sum))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Predict labels for test_X (defined outside this section) with the gradient-boosting model gb.\n",
    "pred_test1 = gb.predict(test_X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Program Files\\Anaconda3\\lib\\site-packages\\sklearn\\preprocessing\\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n",
      "  if diff:\n"
     ]
    }
   ],
   "source": [
    "# Predict labels for test_X (defined outside this section) with the XGBoost model gbm.\n",
    "pred_test = gbm.predict(test_X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Combine the two test-set predictions: a row is labelled 1 when at least one of the\n",
    "# two models (gb, gbm) predicted 1, otherwise 0.\n",
    "# The threshold is applied to the whole array at once, so it no longer depends on a\n",
    "# loop hard-coded to 10000 rows.\n",
    "# NOTE(review): the validation ensemble above requires all three models to agree, while\n",
    "# this one accepts a single vote from two models -- confirm the difference is intended.\n",
    "test_votes = pred_test1 + pred_test\n",
    "pred_sum = (test_votes >= 1).astype(test_votes.dtype)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10000,)"
      ]
     },
     "execution_count": 156,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: one combined prediction per test row ((10000,) in the saved output).\n",
    "pred_sum.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 157,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: the largest combined label should be 1 after thresholding.\n",
    "pred_sum.max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(25000, 171)"
      ]
     },
     "execution_count": 158,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Shape of x, which is defined outside this section ((25000, 171) in the saved output).\n",
    "x.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>x_3</th>\n",
       "      <th>x_4</th>\n",
       "      <th>x_5</th>\n",
       "      <th>x_6</th>\n",
       "      <th>x_7</th>\n",
       "      <th>x_8</th>\n",
       "      <th>x_9</th>\n",
       "      <th>x_10</th>\n",
       "      <th>x_11</th>\n",
       "      <th>x_12</th>\n",
       "      <th>...</th>\n",
       "      <th>x_1552</th>\n",
       "      <th>x_1553</th>\n",
       "      <th>x_157-99</th>\n",
       "      <th>x_1571</th>\n",
       "      <th>x_1572</th>\n",
       "      <th>x_1573</th>\n",
       "      <th>x_1574</th>\n",
       "      <th>x_15710</th>\n",
       "      <th>x_15711</th>\n",
       "      <th>y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>-99</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 171 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   x_3  x_4  x_5  x_6  x_7  x_8  x_9  x_10  x_11  x_12 ...  x_1552  x_1553  \\\n",
       "0  -99  -99  -99  -99  -99  -99  -99   -99   -99   -99 ...       0       0   \n",
       "1  -99  -99  -99  -99  -99  -99  -99   -99   -99   -99 ...       0       0   \n",
       "2    0    0    0    0    0    0    0     0     0     0 ...       0       0   \n",
       "3    0    0    0    0    1    1    0     0     0     0 ...       0       0   \n",
       "4  -99  -99  -99  -99  -99  -99    0     1     1     0 ...       0       0   \n",
       "\n",
       "   x_157-99  x_1571  x_1572  x_1573  x_1574  x_15710  x_15711  y  \n",
       "0         1       0       0       0       0        0        0  0  \n",
       "1         0       0       1       0       0        0        0  0  \n",
       "2         0       0       1       0       0        0        0  0  \n",
       "3         0       0       0       0       1        0        0  0  \n",
       "4         0       1       0       0       0        0        0  0  \n",
       "\n",
       "[5 rows x 171 columns]"
      ]
     },
     "execution_count": 159,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of x (defined outside this section).\n",
    "x.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Wrap the combined test predictions in a DataFrame so they can be stacked under Y_train.\n",
    "pred_sum_pd = pd.DataFrame(pred_sum)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(25000, 1)"
      ]
     },
     "execution_count": 161,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Stack the training labels and the test-set predictions into one frame.\n",
    "# pd.concat replaces DataFrame.append, which is deprecated and was removed in pandas 2.0;\n",
    "# the original row index of each part is kept, exactly as append did.\n",
    "y = pd.concat([Y_train, pred_sum_pd])\n",
    "y.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(15000, 103)"
      ]
     },
     "execution_count": 162,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Shape of the training feature frame ((15000, 103) in the saved output).\n",
    "x_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10000, 103)"
      ]
     },
     "execution_count": 163,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Shape of the test feature frame ((10000, 103) in the saved output).\n",
    "x_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(25000, 103)"
      ]
     },
     "execution_count": 164,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Stack the train and test feature frames row-wise (15000 + 10000 rows in the saved outputs).\n",
    "# Note: this rebinds xx, which an earlier cell used for the labelled training frame.\n",
    "xx = pd.concat([x_train,x_test])\n",
    "xx.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Attach the stacked labels as column 'y'.\n",
    "# NOTE(review): y and xx are each built by stacking two frames without resetting the index,\n",
    "# so this assignment relies on the two indexes lining up -- confirm before reordering either frame.\n",
    "xx['y'] = y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(25000, 104)"
      ]
     },
     "execution_count": 166,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Sanity check: one extra column ('y') after the assignment ((25000, 104) in the saved output).\n",
    "xx.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Write the combined frame (features plus the 'y' column) to CSV without the row index.\n",
    "# NOTE(review): the 'y' values stacked after Y_train are model predictions (pred_sum), not\n",
    "# ground truth -- presumably intended as pseudo-labels; confirm before training on this file.\n",
    "xx.to_csv('../data/train_xy_all.csv',index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
